# setup
import pandas as pd
import numpy as np
import folium
from sqlalchemy import create_engine
# NOTE(review): `connection_string` is not defined anywhere in this file —
# presumably supplied by an earlier (hidden/credentials) cell; confirm it is
# in scope before running.
db_string = connection_string
db = create_engine(db_string)
# reading table from database
properties = pd.read_sql_table("NewNewHouseScrape", con=db)
properties.head()
properties.shape
# Trim the dataframe to the columns relevant for this analysis.
unneeded_columns = ["index", "inspection", "listing_tag", "listing_details_features",
                    "listing_details_description", "url", "listing_details_insights"]
properties.drop(unneeded_columns, axis=1, inplace=True)
# drop rows missing key address details (empty postcode) and renumber the index
has_postcode = properties["postcode"] != ""
properties = properties[has_postcode]
properties.reset_index(drop=True, inplace=True)
# copying original street column for debugging
# properties.insert(loc=1, column="street_og", value=properties["street"])
# normalise the street column to lowercase for the cleaning steps below
properties["street"] = properties["street"].str.lower()
# streamlining street suffix format
# BUG FIX: "ct" is the standard abbreviation for "court" ("cct" already covers
# "circuit"); the original mapped "ct," to "circuit".
suffix_dictionary = {"st,": "street",
                     "cr,": "crescent",
                     "cl,": "close",
                     "cct,": "circuit",
                     "ct,": "court",
                     "dr,": "drive",
                     "rd,": "road",
                     "ave,": "avenue",
                     "blvd": "boulevard"}
for key, value in suffix_dictionary.items():
    # \b keeps an abbreviation from matching inside a longer word (the bare
    # substring replace turned e.g. "west," into "westreet"); regex=True is
    # explicit because pandas >= 2.0 defaults str.replace to literal matching.
    properties["street"] = properties["street"].str.replace(
        r"\b" + key, value, regex=True)
# removing any unwanted characters
properties["street"] = properties["street"].str.replace('"', "'")
# strip stray punctuation ('.', '*', '(', ')', ',' — all literal inside [])
# regex=True is explicit because pandas >= 2.0 defaults str.replace to
# literal matching, which would make the pattern a no-op.
pattern = r"[.*(),]"
properties["street"] = properties["street"].str.replace(pattern, "", regex=True)
# BUG FIX: the original replaced " " with " " — a no-op; per its own comment
# the intent was to remove unwanted white space, so collapse whitespace runs
# to a single space and trim the ends.
properties["street"] = properties["street"].str.replace(r"\s+", " ", regex=True).str.strip()
# manual cleaning of errors encountered in the dataset
properties["street"] = properties["street"].str.replace("pacfic", "pacific")
# deriving a street_number column from the street column
properties.insert(loc=2, column="street_number", value=properties["street"])
# drop " <letters>" runs (the street-name words), then any remaining runs of
# 2+ letters, then stray apostrophes. regex=True is explicit because
# pandas >= 2.0 defaults str.replace to literal matching (no-op patterns).
pattern_one = r"\s([a-z]+)"
properties["street_number"] = properties["street_number"].str.replace(pattern_one, "", regex=True)
pattern_two = r"[a-z]{2,}"
properties["street_number"] = properties["street_number"].str.replace(pattern_two, "", regex=True)
properties["street_number"] = properties["street_number"].str.replace("'", "").str.strip()
# deriving a street_name column from the street column
properties.insert(loc=3, column="street_name", value=properties["street"])
# all patterns below rely on regex matching; regex=True is explicit because
# pandas >= 2.0 defaults str.replace to literal matching (no-op patterns).
pattern_three = r"\d+"  # digits (street/lot numbers)
properties["street_name"] = properties["street_name"].str.replace(pattern_three, "", regex=True)
# "lot"/"lots" markers from land listings
properties["street_name"] = properties["street_name"].str.replace("lots?", "", regex=True)
pattern_four = r"[&/-]"  # separators left over from unit/lot notation
properties["street_name"] = properties["street_name"].str.replace(pattern_four, "", regex=True)
# stray 1-2 letter tokens (unit suffixes etc.); NOTE(review): the comma inside
# the class is a literal character and 'a'/'o' are excluded from the ranges —
# confirm that is intentional before tightening to [b-np-z]
pattern_five = r"\b[b-n,p-z]{1,2}\b"
properties["street_name"] = properties["street_name"].str.replace(pattern_five, "", regex=True)
pattern_six = r"\b[ab]\b"  # leftover single 'a'/'b' unit markers
properties["street_name"] = properties["street_name"].str.replace(pattern_six, "", regex=True).str.strip()
# recombining the cleaned pieces into a single clean_street column
properties.insert(loc=4, column="clean_street",
                  value=properties["street_number"] + " " + properties["street_name"])
properties.drop(["street"], axis=1, inplace=True)
properties.sample(5)
Some listings in our dataset (mainly the ones advertising land) don't have entries for the number of bedrooms, bathrooms and parking spots, and have been scraped with the property size listed in the bedroom column while the size column is empty.
# examples:
# vacant-land rows where the size was scraped into the "bed" column
properties[properties["prop_type"] == "Vacant land"].sample(3)
Let's correct this by creating a new size column which pulls the size from the bed column where required and from the size column otherwise.
# Build a size_updated column: take the value from "bed" when it actually
# holds a measurement (contains "m" for m² or "ha" for hectares),
# otherwise carry over the value from "size".
size_update_list = [
    row["bed"] if ("m" in row["bed"] or "ha" in row["bed"]) else row["size"]
    for _, row in properties.iterrows()
]
size_updated = pd.DataFrame(size_update_list, columns=["size_updated"])
properties = pd.concat([properties, size_updated], axis=1)
properties.sample(3)
# creating a copy of the original size_updated column for debugging
# properties["size_og"] = properties["size_updated"]
# drop thousands separators and surrounding whitespace
properties["size_updated"] = properties["size_updated"].str.replace(",", "").str.strip()
# entries already in square metres just lose their measurement unit
properties["size_updated"] = properties["size_updated"].str.replace("m²", "")
# build the cleaned size values: hectare entries are converted to square
# metres (1 ha = 10,000 m²); everything else is carried over unchanged
clean_size = []
for value in properties["size_updated"]:
    if "ha" in value:
        clean_size.append(float(value.replace("ha", "")) * 10000)
    else:
        clean_size.append(value)
properties["clean_size"] = clean_size
# converting the values in the clean_size column to numerical form
properties["clean_size"] = pd.to_numeric(properties["clean_size"], errors="coerce")
# dropping the original size and updated_size columns
properties.drop(["size", "size_updated"], axis=1, inplace=True)
properties.sample(5)
# creating a copy of the original columns for debugging
# properties["og_bed"] = properties["bed"]
# properties["og_bath"] = properties["bath"]
# properties["og_parking"] = properties["parking"]
# removing size values from bed column
# regex=True is explicit on every pattern-based replace: pandas >= 2.0
# defaults str.replace to literal matching, which would make them no-ops.
pattern_x = r".+ha"
pattern_y = r".+m²"
properties["bed"] = properties["bed"].str.replace(pattern_x, "", regex=True)
properties["bed"] = properties["bed"].str.replace(pattern_y, "", regex=True)
# removing any letters
pattern_1 = r"[a-z]+"
columns = ["bed", "bath", "parking"]
for column in columns:
    properties[column] = properties[column].str.lower().str.replace(pattern_1, "", regex=True)
# additional cleaning for the bed column
# ("−" below is U+2212 MINUS SIGN, not an ASCII hyphen — keep it as-is)
pattern_2 = r"[.,−]"
pattern_3 = r"\d{3,}"  # 3+ digit runs are stray size values, not bed counts
properties["bed"] = properties["bed"].str.replace(pattern_2, "", regex=True)
properties["bed"] = properties["bed"].str.replace(pattern_3, "", regex=True).str.strip()
# additional cleaning for the bath column
properties["bath"] = properties["bath"].str.replace("−", "").str.strip()
# additional cleaning for the parking column: "−" means no parking, i.e. 0
properties["parking"] = properties["parking"].str.replace("−", "0").str.strip()
# converting the clean values to numerical form
for column in columns:
    properties[column] = pd.to_numeric(properties[column], errors="coerce")
properties.sample(5)
# creating a copy of the price column for debugging
properties["price_og"] = properties["price"]
# converting all text in the price column to lowercase
properties["price"] = properties["price"].str.lower()
# creating a new column to capture auctions (1 = auction, 0 = priced listing)
pattern = r"auction"
auctions = properties["price"].str.contains(pattern)
properties["auction"] = auctions.astype(int)
# cleaning
# regex=True is explicit on the pattern-based replaces: pandas >= 2.0
# defaults str.replace to literal matching, which would make them no-ops.
properties["price"] = properties["price"].str.replace("park", "")
# NOTE(review): "k " only expands abbreviated thousands when a space follows
# the "k" (e.g. "350k guide"); a trailing "350k" is left alone — confirm.
properties["price"] = properties["price"].str.replace("k ", "000")
pattern_a = r"\D+"
properties["price"] = properties["price"].str.replace(",", "").str.replace(" ", "")
# collapse every non-digit run to a single space, leaving the digit groups
properties["price"] = properties["price"].str.replace(pattern_a, " ", regex=True).str.strip()
# BUG FIX: the original applied pattern_two (r"[a-z]{2,}", defined far above
# for street cleaning) on the next line — a no-op, since pattern_a already
# removed all letters. pattern_b, defined immediately before, was clearly
# intended: it strips stray standalone "00" tokens.
pattern_b = r"\b(00)\b"
properties["price"] = properties["price"].str.replace(pattern_b, "", regex=True).str.strip()
properties["price"] = properties["price"].str.replace(r"2020", "", regex=True).str.strip()
# 1-2 digit tokens are never prices (dates, unit counts, etc.)
properties["price"] = properties["price"].str.replace(r"\b[0-9]{1,2}\b", "", regex=True).str.strip()
properties["price"] = properties["price"].str.replace(r"^\s*$", "", regex=True).str.strip()
# numbers starting with 04 are Australian mobile numbers, not prices
pattern_c = r"(^04\d+)"
properties["price"] = properties["price"].str.replace(pattern_c, "", regex=True).str.strip()
# creating price min and price max columns from the first and last token
properties["price_min"] = properties["price"].str.split(" ").str.get(0)
properties["price_max"] = properties["price"].str.split(" ").str.get(-1)
# converting price min and price max columns to numeric format
properties["price_min"] = pd.to_numeric(properties["price_min"], errors="coerce")
properties["price_max"] = pd.to_numeric(properties["price_max"], errors="coerce")
# dropping original price column
properties.drop(["price"], axis=1, inplace=True)
properties.sample(5)
# mean of the price-guide bounds; NaN in either bound propagates to the mean
properties["price_mean"] = properties["price_min"].add(properties["price_max"]) / 2
properties.sample(5)
The scraper does not check whether a property already exists in our database - each time it is run, it simply adds all properties it finds to the existing data. This means that each property in the dataset is likely to have multiple duplicate entries.
# shape of dataframe (including duplicates)
properties.shape
# random example of duplication: the same listing scraped on several dates
properties[(properties["clean_street"] == "9 anvil street") & (properties["suburb"] == "GRETA")]
For the purpose of this initial analysis, we will first sort the dataframe by scrape_date in descending order, and then remove duplicates based on the clean_street and suburb columns, keeping only the first entry - which will correspond to the latest record due to the way in which we sort the data.
# Keep only the most recent record per property: sort newest-first by
# scrape_date, then drop later (older) occurrences of each address.
properties = (properties
              .sort_values("scrape_date", ascending=False)
              .drop_duplicates(subset=["clean_street", "suburb"]))
# resetting index
properties.reset_index(drop=True, inplace=True)
# confirming everything worked as expected using the example we looked at above
# - only the most recent entry (20/07/2020) has been kept
properties[(properties["clean_street"] == "9 anvil street") & (properties["suburb"] == "GRETA")]
# shape of dataframe after removal of duplicates
properties.shape
We have made great progress with our cleaning, however, given the messy scraped dataset we started off with, we expect there to still be issues - data entry related, scraper related, or cleaning related as the methods we applied to create our clean columns might not have worked for all entries.
The price column with its countless variations in which details were entered made it particularly difficult to account for all scenarios. Let's look at the columns relating to price to see if there is anything further we can do to improve data accuracy.
# min for the price_min column
print("Minimum: " + str(properties["price_min"].min()))
# entries with price_min values of less than 50000 (suspiciously cheap)
properties[properties["price_min"] < 50000]
Looking at the summary of entries with minimum prices of less than $50,000, we observe that some of the low prices seem legitimate (i.e. small areas of vacant land), while others appear to be errors, either caused by data entry or scraping mistakes (i.e. forgetting a zero) or by cleaning errors (i.e. not accounting for decimal prices entered in millions).
How should we proceed?
The easiest option would be to simply remove any entry with minimum pricing below $65,000, but we would lose some legitimate entries and could potentially affect the pricing analysis for certain suburbs such as North Arm Cove, where vacant land appears to be particularly cheap. Another option would be to try and manually correct as many entries as possible, but the small improvements we could make don't justify the time spent on each individual entry - and being a live project with the scraper being run and data being added to the database every few days, this approach will be less and less practical.
For the purpose of this analysis, we find that removing any entries with a minimum price of less than $50,000 that are not vacant land appears to be the best option, however some incorrect entries would remain:
# example available at the time of assembling this project
# (a hard-coded row index — only meaningful for that snapshot of the data)
properties[properties.index == 3672]
Is there anything we can do to further improve our approach?
Looking at our example above, we notice a great discrepancy between the values for minimum and maximum price.
The price_min and price_max columns reflect the pricing guide for each property which is meant to give potential buyers an idea of the price range a listing falls into, meaning the gap between the two values should not be too large.
Looking at our data, we find a few entries where the price maximum is at least 1.5 times as high as the price minimum, and to no great surprise, all of these listings have questionable price_min or price_max entries.
# rows where price_max exceeds 1.5x price_min (suspicious pricing gap)
properties[properties["price_max"] - (properties["price_min"] * 1.5) > 0]
We decide to remove any entries with a minimum price of less than $50,000 that are not vacant land, and then to also remove any entries where the price_max value is at least 1.5 times as high as the price minimum.
# dropping properties with price_min < 50000 that are not vacant land
suspiciously_cheap = (properties["price_min"] < 50000) & (properties["prop_type"] != "Vacant land")
properties.drop(properties[suspiciously_cheap].index, axis=0, inplace=True)
# dropping rows with unreasonable pricing gap (price_max > 1.5 * price_min)
wide_gap = properties["price_max"] - (properties["price_min"] * 1.5) > 0
properties.drop(properties[wide_gap].index, axis=0, inplace=True)
# resetting index
properties.reset_index(drop=True, inplace=True)
properties.shape
Let's next take a look at the price_max column.
# min for the price_max column (should not be lower than price_min's minimum)
print("Minimum: " + str(properties["price_max"].min()))
The minimum for the price_max column (0) is less than the minimum we originally found for the price_min column (190), which reveals that we must have entries in the dataset where price_max < price_min:
# rows where the price range is inverted (max below min)
properties[properties["price_max"] < properties["price_min"]]
Looking at the above output, we find that most errors are caused by data entry or scraping mistakes, and only few could be fixed by accounting for additional scenarios in our data cleaning process. As a result, we decide to simply drop all rows where price_max < price_min.
# dropping rows where price_max < price_min (inverted price range)
inverted_range = properties["price_max"] < properties["price_min"]
properties.drop(properties[inverted_range].index, axis=0, inplace=True)
# resetting index
properties.reset_index(drop=True, inplace=True)
properties.shape
# max for the price_max column
print("Maximum: " + str(properties["price_max"].max()))
# inspecting the most expensive listings for plausibility
properties[properties["price_max"] > 3400000]
Only one of the prices above raises question marks - the entry for 1/41 Denton Park Drive in Rutherford appears to be incorrect and should be $350,000 (instead of \\$3,500,000) as a quick check on the property website confirms:

When we dealt with the questionable minimum values and unrealistic price gaps, the solution we implemented was generic, capable of addressing similar issues in future listings that might be added to the database.
If we were to correct the entry for 1/41 Denton Park Drive, our solution would be specific to this single row in our database, and useless for any other entries. What's more, should our scraper pick up the correct pricing next time it runs, the incorrect entry above would be replaced with the latest record (as per the guidelines we specified for the removal of duplicate lines), and any fixes addressing the line would be useless or could "correct" the listing incorrectly.
At this stage, we decide to leave the entry as is.
# dropping the price_og column (debug copy, no longer needed)
properties.drop("price_og", axis=1, inplace=True)
# date range covered by the (deduplicated) data
print(properties["scrape_date"].min())
print(properties["scrape_date"].max())
# number of suburbs
properties["suburb"].nunique()
# counts of unique values for the bed column
properties["bed"].value_counts().sort_index()
# counts of unique values for the property type column
properties["prop_type"].value_counts()
# min, max and mean property size (m²)
print("Minimum Property Size: " + str(properties["clean_size"].min()))
print("Maximum Property Size: " + str(properties["clean_size"].max()))
print("Mean Property Size: " + str(properties["clean_size"].mean()))
# total number of auctions
properties["auction"].sum()
# percentage of auctions
(properties["auction"].sum() / len(properties)) * 100
# avg bedrooms, size and price per suburb
grouped = properties.groupby("suburb")[["bed", "clean_size", "price_mean"]]
# use the string alias "mean" rather than np.mean: passing the NumPy function
# to GroupBy.agg is deprecated in pandas 2.x and maps to "mean" anyway
properties_mean_bsp = round(grouped.agg("mean"))
properties_mean_bsp["properties_per_group"] = grouped.size()
properties_mean_bsp.reset_index(inplace=True)
properties_mean_bsp
# function for looking at certain suburbs only
def return_suburb_statistics(suburb_list):
    """Return the per-suburb averages for the given suburbs, priciest first."""
    selected = properties_mean_bsp["suburb"].isin(suburb_list)
    return properties_mean_bsp[selected].sort_values("price_mean", ascending=False)
# closer look at Newcastle and surrounding suburbs
# (suburb names uppercased to match the values stored in the suburb column)
newcastle_suburbs = ["NEWCASTLE", "NEWCASTLE WEST", "NEWCASTLE EAST", "THE HILL", "COOKS HILL",
"BAR BEACH", "THE JUNCTION"]
return_suburb_statistics(newcastle_suburbs)
# closer look at some Lake Macquarie suburbs
lake_mac_suburbs = ["CHARLESTOWN", "GATESHEAD", "KAHIBAH", "WHITEBRIDGE", "DUDLEY", "REDHEAD",
"BENNETTS GREEN","WINDALE", "TINGIRA HEIGHTS", "WARNERS BAY"]
return_suburb_statistics(lake_mac_suburbs)
# statistics per suburb and prop_type
grouped = properties.groupby(["prop_type", "suburb"])[["bed", "clean_size", "price_mean"]]
# use the string alias "mean" rather than np.mean: passing the NumPy function
# to GroupBy.agg is deprecated in pandas 2.x and maps to "mean" anyway
properties_mean_prop_type = round(grouped.agg("mean"))
properties_mean_prop_type["properties_per_group"] = grouped.size()
properties_mean_prop_type
# function for looking at certain suburbs and prop_types only
def return_proptype_suburb_statistics(suburb_list, prop_type_list):
    """Filter the suburb/prop_type statistics down to the given lists."""
    suburb_level = properties_mean_prop_type.index.get_level_values("suburb")
    type_level = properties_mean_prop_type.index.get_level_values("prop_type")
    mask = suburb_level.isin(suburb_list) & type_level.isin(prop_type_list)
    return properties_mean_prop_type[mask]
# property types to compare across suburbs
prop_types_to_include = ["House", "Apartment / Unit / Flat"]
# closer look at Newcastle and surrounding suburbs
return_proptype_suburb_statistics(newcastle_suburbs, prop_types_to_include)
# closer look at some Lake Macquarie suburbs
return_proptype_suburb_statistics(lake_mac_suburbs, prop_types_to_include)
We used the clean address details to get the geo location data for our properties and stored them in a separate database. Let's load the data into our notebook and take a look before merging it with our properties dataframe.
# geo data: one row per address, with coordinates cast to floats
allGeoData = pd.read_sql_table("geodata2", con=db)
allGeoData = allGeoData.drop_duplicates(["street", "suburb"])
allGeoData.drop(["id", "state", "postcode"], axis=1, inplace=True)
for coord in ("lat", "lng"):
    allGeoData[coord] = allGeoData[coord].astype(float)
allGeoData.sample(5)
# left-join the coordinates onto the properties via the cleaned address
merged = pd.merge(properties, allGeoData, left_on=["clean_street", "suburb"],
                  right_on=["street", "suburb"], how="left")
# dropping the street column as it is basically a duplicate of the clean_street column
merged.drop("street", axis=1, inplace=True)
merged.sample(5)
# confirming the number of rows in merged is equal to the number of rows in properties
merged.shape
Before we can plot our property geo location data on a map, we first need to clean the new columns, then remove any rows that do not have details in the latitude and longitude columns.
# normalise the geocoder's failure markers to NaN
merged["formattedname"] = merged["formattedname"].replace("Not Found", np.nan)
# a zero coordinate means the lookup failed, not a real location
for coord in ("lat", "lng"):
    merged[coord] = merged[coord].replace(0, np.nan)
# keep only rows with usable coordinates for plotting
plot_data = merged[merged["lat"].notnull()]
We cleaned and combined our datasets and removed any properties for which latitude and longitude details were unavailable. Next, we will use the resulting dataframe to create our map.
# base map centred on the mean coordinate of the plottable properties
# NOTE(review): the free "Stamen Toner" tile set was discontinued by its
# original host — confirm these tiles still load, or switch providers.
m = folium.Map(location=[plot_data["lat"].mean(), plot_data["lng"].mean()], zoom_start=9, tiles='Stamen Toner')
# creating the colour legend
import branca
colours = ['#14fff7', '#102aeb', '#77e610', '#faf219', '#fcac0a', '#eb4034']
colormap = branca.colormap.StepColormap(colors=colours, vmin=0, vmax=1200000, caption="Price")
colormap.add_to(m);
def colours_and_circles(data, feature_group):
    """Add one CircleMarker per row of `data` to `feature_group`.

    Marker colour encodes the mean-price band (black when price_mean is
    NaN) and marker radius encodes the property-size band (smallest when
    clean_size is NaN). Each marker carries a popup with the listing's
    address and key details.
    """
    # (lower bound, colour): the first band whose bound the price exceeds
    # wins; prices of 200000 or less fall through to the default cyan
    price_bands = [(1000000, '#eb4034'), (800000, '#fcac0a'), (600000, '#faf219'),
                   (400000, '#77e610'), (200000, '#102aeb')]
    # (lower bound, radius): the first band whose bound the m² size reaches
    # wins; sizes under 500 fall through to the default radius of 2
    size_bands = [(5000, 16), (2500, 14), (1800, 12), (1000, 10), (750, 7), (500, 5)]
    for _, row in data.iterrows():
        price = row["price_mean"]
        colour = '#000000' if pd.isnull(price) else '#14fff7'
        if not pd.isnull(price):
            for lower_bound, band_colour in price_bands:
                if price > lower_bound:
                    colour = band_colour
                    break
        size = row["clean_size"]
        r = 2
        if not pd.isnull(size):
            for lower_bound, band_radius in size_bands:
                if size >= lower_bound:
                    r = band_radius
                    break
        folium.CircleMarker(
            location=[row["lat"], row["lng"]],
            radius=r,
            # popup label with the address, property type, number of
            # bedrooms, size in m² and the estimated price
            popup=folium.Popup(str(row["clean_street"]).title() + "<br>" + str(row["suburb"]) + " " + str(row["state"]) + " " + str(row["postcode"]) + "<br>Property Type: " + str(row["prop_type"]) + "<br>Bedrooms: " + str(row["bed"]) + "<br>Size: " + str(row["clean_size"]) + "<br>Estimated Price: " + str(row["price_mean"]), max_width=250),
            color=colour,
            fill=True,
            fill_color=colour
        ).add_to(feature_group)
# One map layer per bedroom-count bucket, so the LayerControl below lets
# the viewer toggle which properties are displayed.
bed = plot_data["bed"]
bedroom_layers = [
    ("unknown number of bedrooms", plot_data[pd.isnull(bed)]),
    ("less than two bedrooms", plot_data[bed < 2]),
    ("two bedrooms", plot_data[bed == 2]),
    ("three bedrooms", plot_data[bed == 3]),
    ("four bedrooms", plot_data[bed == 4]),
    ("five bedrooms", plot_data[bed == 5]),
    ("more than five bedrooms", plot_data[bed > 5]),
]
for layer_name, layer_rows in bedroom_layers:
    feature_group = folium.FeatureGroup(name=layer_name)
    colours_and_circles(layer_rows, feature_group)
    feature_group.add_to(m)
folium.LayerControl().add_to(m)
m